{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "sarcasm_preprocess.ipynb",
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/lmoroney/tfbook/blob/master/chapter5/sarcasm_preprocess.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zX4Kg8DUTKWO",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
        "# You may obtain a copy of the License at\n",
        "#\n",
        "# https://www.apache.org/licenses/LICENSE-2.0\n",
        "#\n",
        "# Unless required by applicable law or agreed to in writing, software\n",
        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License."
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "z3WdEM6vlRXD",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!pip install beautifulsoup4"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "D1J15Vh_1Jih",
        "colab_type": "code",
        "cellView": "both",
        "colab": {}
      },
      "source": [
        "try:\n",
        "  # %tensorflow_version only exists in Colab.\n",
        "  %tensorflow_version 2.x\n",
        "except Exception:\n",
        "  pass\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Ue16ggmfpmCs",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import tensorflow as tf\n",
        "\n",
        "\n",
        "from tensorflow.keras.preprocessing.text import Tokenizer\n",
        "from tensorflow.keras.preprocessing.sequence import pad_sequences"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "WpOdgZnNlWY5",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from bs4 import BeautifulSoup\n",
        "import string\n",
        "\n",
        "stopwords = [\"a\", \"about\", \"above\", \"after\", \"again\", \"against\", \"all\", \"am\", \"an\", \"and\", \"any\", \"are\", \"as\", \"at\",\n",
        "             \"be\", \"because\", \"been\", \"before\", \"being\", \"below\", \"between\", \"both\", \"but\", \"by\", \"could\", \"did\", \"do\",\n",
        "             \"does\", \"doing\", \"down\", \"during\", \"each\", \"few\", \"for\", \"from\", \"further\", \"had\", \"has\", \"have\", \"having\",\n",
        "             \"he\", \"hed\", \"hes\", \"her\", \"here\", \"heres\", \"hers\", \"herself\", \"him\", \"himself\", \"his\", \"how\",\n",
        "             \"hows\", \"i\", \"id\", \"ill\", \"im\", \"ive\", \"if\", \"in\", \"into\", \"is\", \"it\", \"its\", \"itself\",\n",
        "             \"lets\", \"me\", \"more\", \"most\", \"my\", \"myself\", \"nor\", \"of\", \"on\", \"once\", \"only\", \"or\", \"other\", \"ought\",\n",
        "             \"our\", \"ours\", \"ourselves\", \"out\", \"over\", \"own\", \"same\", \"she\", \"shed\", \"shell\", \"shes\", \"should\",\n",
        "             \"so\", \"some\", \"such\", \"than\", \"that\", \"thats\", \"the\", \"their\", \"theirs\", \"them\", \"themselves\", \"then\",\n",
        "             \"there\", \"theres\", \"these\", \"they\", \"theyd\", \"theyll\", \"theyre\", \"theyve\", \"this\", \"those\", \"through\",\n",
        "             \"to\", \"too\", \"under\", \"until\", \"up\", \"very\", \"was\", \"we\", \"wed\", \"well\", \"were\", \"weve\", \"were\",\n",
        "             \"what\", \"whats\", \"when\", \"whens\", \"where\", \"wheres\", \"which\", \"while\", \"who\", \"whos\", \"whom\", \"why\",\n",
        "             \"whys\", \"with\", \"would\", \"you\", \"youd\", \"youll\", \"youre\", \"youve\", \"your\", \"yours\", \"yourself\",\n",
        "             \"yourselves\"]\n",
        "\n",
        "table = str.maketrans('', '', string.punctuation)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BOHL1UZrAATP",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!wget --no-check-certificate \\\n",
        "    https://storage.googleapis.com/learning-datasets/sarcasm.json \\\n",
        "    -O /tmp/sarcasm.json\n",
        "  \n",
        "import json\n",
        "from tensorflow.keras.preprocessing.text import Tokenizer\n",
        "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
        "\n",
        "with open(\"/tmp/sarcasm.json\", 'r') as f:\n",
        "    datastore = json.load(f)\n",
        "\n",
        "\n",
        "sentences = [] \n",
        "labels = []\n",
        "urls = []\n",
        "for item in datastore:\n",
        "    sentence = item['headline'].lower()\n",
        "    sentence = sentence.replace(\",\", \" , \")\n",
        "    sentence = sentence.replace(\".\", \" . \")\n",
        "    sentence = sentence.replace(\"-\", \" - \")\n",
        "    sentence = sentence.replace(\"/\", \" / \")\n",
        "    soup = BeautifulSoup(sentence)\n",
        "    sentence = soup.get_text()\n",
        "    words = sentence.split()\n",
        "    filtered_sentence = \"\"\n",
        "    for word in words:\n",
        "        word = word.translate(table)\n",
        "        if word not in stopwords:\n",
        "            filtered_sentence = filtered_sentence + word + \" \"\n",
        "    sentences.append(filtered_sentence)\n",
        "    labels.append(item['is_sarcastic'])\n",
        "    urls.append(item['article_link'])\n",
        "\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Rtu9POZO0er1",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "print(len(sentences))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Y9TOV7biBQ8-",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "training_size = 23000\n",
        "\n",
        "training_sentences = sentences[0:training_size]\n",
        "testing_sentences = sentences[training_size:]\n",
        "training_labels = labels[0:training_size]\n",
        "testing_labels = labels[training_size:]\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VZloWEQUBF4e",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "vocab_size = 20000\n",
        "max_length = 10\n",
        "trunc_type='post'\n",
        "padding_type='post'\n",
        "oov_tok = \"<OOV>\"\n",
        "\n",
        "tokenizer = Tokenizer(num_words=vocab_size, oov_token=\"<OOV>\")\n",
        "tokenizer.fit_on_texts(training_sentences)\n",
        "\n",
        "word_index = tokenizer.word_index\n",
        "\n",
        "training_sequences = tokenizer.texts_to_sequences(training_sentences)\n",
        "padded = pad_sequences(training_sequences, padding='post')\n",
        "print(word_index)"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}